Explore Topic Modeling

Code
library(topicmodels)
library(tidyverse)
library(tidygraph)
library(ggraph)
library(arrow)
library(tidytext)

Load Data

set up and load data
gtr_desc = read_parquet("data/gtr.parquet") |> 
  select(id, abstractText)
gtr_meta = read_csv("data/projectsearch-1709481069771.csv") |> 
  mutate(across(ends_with("Date"), ~as.Date(.x,"%d/%m/%Y"))) |> 
  rename(id=ProjectId)
PARTIAL_PTTN='(/[1-9])$'

## waggle some columns for analytics
gtr_pj = gtr_meta |>
  mutate(
    is_partial = str_detect(ProjectReference, PARTIAL_PTTN),
    project_ref = str_replace(ProjectReference,PARTIAL_PTTN,""),
    part = str_extract(ProjectReference, PARTIAL_PTTN) |> 
      str_extract("\\d+") |> 
      as.numeric() |> 
      coalesce(0)
  ) |> 
  # filter(is_partial) |> 
  group_by(project_ref) |>
  mutate(occurance = n()) |> 
  ungroup() |> 
  dplyr::relocate(ProjectReference, FundingOrgName, LeadROName, id, 
           is_partial,project_ref,part,occurance)

## early stop if this is no longer true
stopifnot(
  "Project Reference is NOT Unique!"=length(unique(gtr_pj$ProjectReference)) == nrow(gtr_pj),
  "Project Refrence contain NA!"=!any(is.na(gtr_pj$ProjectReference))
)

## find out about 
unique_prj = gtr_pj |>
  relocate(ProjectReference, project_ref, id) |> 
  group_by(project_ref) |> 
  mutate(rn=row_number()) |> 
  filter(rn==1) |> 
  select(-rn) |> 
  ungroup()

## find out with text other than continuous project are repeated
repeated_text = gtr_desc |> 
  group_by(abstractText) |> 
  mutate(n=n()) |> 
  filter(n!=1) |>
  arrange(abstractText)

## Take the repeated test one out for now.
analysis_prj = unique_prj |> 
  anti_join(repeated_text, by="id") |> 
  mutate(year = lubridate::year(StartDate)) |>
  inner_join(gtr_desc, by="id")

N Gram Exploration

Code
## break into bi-grame
abstract_words = analysis_prj |>
  unnest_tokens(word, abstractText, "ngrams",n=2, drop=T) |> 
  count(word,id, sort=T)

## try this `bind_tf_idf` function
word_distinct = abstract_words |> 
  bind_tf_idf(word,id, n)

## tf_idf is good for sceope out distinct words about something?
word_distinct |> 
  arrange(desc(tf_idf))

Review Abstract Word Count of Abstract

Code
analysis_prj |> 
  count(ProjectCategory,sort=T)
Code
## word count histogram
abstract_words = analysis_prj |> 
  unnest_tokens(word,abstractText) |> 
  anti_join(stop_words)
Joining with `by = join_by(word)`
Code
abstract_words |> 
  count(id,ProjectCategory) |> 
  ggplot() +
  geom_histogram( fill="midnightblue",aes(x=n) ) +
  theme_minimal() +
  ggtitle("Abstract Length") +
  facet_wrap(~ ProjectCategory)
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Research Grant and Studentship have “rich” abstract we can extract on. You probably what to filter down to Research Grant before modeling

Topic Modeling using LDA

Code
abstract_word_count = abstract_words |> 
  count(word,id,sort=T)

## this is very expensive process so cached this result for saving rendering time
if(interactive()) {
  gtr_dtm = abstract_word_count |> 
    cast_dtm(id,word,n)
  topics = gtr_dtm |> topicmodels::LDA(k=10)
  saveRDS(topics, "cache/05-topics.rds")
} else {
  topics=readRDS("cache/05-topics.rds")
}
Code
topics |> 
  tidy("beta") |> 
  group_by(topic) |> 
  slice_max(beta,n=15) |> 
  ggplot(aes(y=reorder_within(term, beta,topic),x= beta )) +
  geom_col() + 
  scale_y_reordered() +
  facet_wrap(~ topic,scales="free_y") +
  theme_minimal() +
  xlab("beta (higher indicate more relevant to topic)") +
  ylab("term") +
  ggtitle("LDA model output result (beta)")

This is a typical graph used to visualize LDA output. beta indicate importance of words to extracted topic.

However, it is still not easy to tell what these topic actually are. We can use n-gram to link most frequently linked words, which will give us a picture of how “these key words” links together.

Code
## examples of top 10 topics.
top10_gamma = topics |> 
  tidy("gamma") |> 
  group_by(topic) |> 
  slice_max(gamma) |> 
  rename(id=document)
top10_gamma |> left_join(gtr_desc,by="id")

Topic against Project Category

Code
## gamma score over the years
## top gamma's n-gram
topic_binded = topics |> 
  tidy("gamma") |> 
  filter(gamma > 0.8) |> # 0.8 is a number we can be sure there is only one topic
                         # for one document
  rename(id=document) |> 
  left_join(analysis_prj, "id")

topic_binded |>
  select(topic, ProjectCategory) |>
  mutate(topic_chr=fct_inorder(as.character(topic)) ) |> 
  complete(topic_chr,ProjectCategory) |> 
  ggplot() + 
  geom_bin2d(aes(x=topic_chr, y = ProjectCategory)) +
  theme_minimal() +
  scale_fill_viridis_c(option = "F",trans="sqrt") +
  ggtitle("Topic Against Porject Category") +
  coord_equal()

useful function create specific plot
## For compute bigram ---------------------------------------------
#' Calculate bigram given document and field
#' @param doc_df document number
#' @param field column of text field
#' @param doc_id id indicating text comes from in fact
compute_bi_gram = function(doc_df,field) {
  .gvars = group_vars(doc_df)
  if(length(.gvars)==0) {
    Gvars=quo(NULL)
  } else {
    Gvars=quo({any_of(.gvars)})
  }
  doc_df |> 
    # select({{field}},!!Gvars ) |> 
    unnest_tokens(pharse,{{field}},'ngrams',n=2) |> 
    separate(pharse, into=c("word1","word2"),sep=" ") |> 
    ## clear up stopwords
    anti_join(stop_words, c("word1"="word")) |> 
    anti_join(stop_words, c("word2"="word")) |> 
    filter(!is.na(word1) & !is.na(word2))
}
#' customer counting function that also
count_bi_gram = function(bi_gram,...) {
  bi_gram |> 
    group_by(word1,word2, .add=T) |> 
    group_by(..., .add=T) |> 
    summarise(n=n(),.groups="drop") |> 
    arrange(desc(n))
}
## plot functions --------------------------------
plot_beta=function(lda_model,ki,max_n=15) {
  lda_model |> 
    tidy("beta") |> 
    filter(topic == ki) |> 
    group_by(topic) |> 
    slice_max(beta,n=max_n) |> 
    ggplot(aes(y=reorder_within(term, beta,topic),x= beta )) +
    geom_col(fill="royalblue3") + 
    scale_y_reordered() +
    theme_minimal()
}
plot_word_graph=function(bi_gram_tokens,
                         ki,
                         top_n=15,
                         model=NULL,
                         color= "cyan4"
                         ) {
  if(!is.null(model)) {
    beta=tidy(model,"beta") |> 
      filter(topic == ki)
    .add_beta = function(x) {
      activate(x,nodes) |> 
        left_join(beta, by=c("name"="term"))
    }
    .add_node_marker = function() {
      geom_node_point(aes(size=beta))
    }
  } else {
    .add_beta = \(x) x
    .add_node_marker = \() geom_node_point(size=5)
  }
  typical_graph=bi_gram_tokens |> 
    count_bi_gram(topic) |> 
    relocate(word1,word2) |> 
    as_tbl_graph()
  ## filter bi-gram graph and plot
  g = typical_graph |> 
    .add_beta() |> 
    activate(edges) |>
    filter(topic == ki) |> 
    arrange(desc(n)) |> 
    filter(row_number() <= top_n) |> 
    activate(nodes) |> 
    filter(!node_is_isolated())
  ## plot one single graph
  g |> 
    ggraph('kk') +
    geom_edge_link(aes(width=n, alpha=n),color= color) +
    geom_node_text(aes(label=name),repel = T) +
    .add_node_marker() +
    theme_void() +
    ggtitle(paste("topic",ki))
}
pre compute expensive data
## data ----------------------
## bind topic result 
topic_binded=topics |> 
  tidy("gamma") |> 
  filter(gamma > 0.8) |> # 0.8 is a number we can be sure there is only one topic
                         # for one document
  rename(id=document) |> 
  left_join(analysis_prj, "id")

typical_docs = topic_binded |> 
  group_by(topic) |> 
  slice_max(gamma,n=1) |> 
  select(topic,abstractText)

## compute a graphic exploration
## combine to bi-gram this is usually the expensive one
bi_gram_tok = topic_binded |>
  compute_bi_gram(abstractText)
## graph -----------------------
plot_topics = function(model,reps_tk,reps_docs,topic_ki,color= "cyan4") {
  ## topic graph
  g1 = reps_tk |>
    plot_word_graph(topic_ki,top_n = 20,model=model,color=color) +
    ggtitle(paste0("Topic", topic_ki,""))
  
  ## beta plot
  g2 = model |> 
    plot_beta(topic_ki,max_n = 20) +
    ggtitle("Beta, words most represent topics")
  
  ## add example texts
  foot_notes = reps_docs |> 
    filter(topic == topic_ki) |> 
    pull(abstractText) |> 
    stringr::str_wrap(160) |> 
    stringr::str_trunc(500)
  g3 = ggplot() + theme_void() + geom_text(aes(x=0,y=0,label = foot_notes)) +
    ggtitle("Example")
  ggpubr::ggarrange(g1,g2) |> 
    ggpubr::ggarrange(g3,nrow=2, heights=c(8,5))
}

The topic modeling are in fact picking up different types of topics. Particularly “Studentship” is extracted as topic 2, “Collaborative R & D” is almost exclusively extracted as topic 10.

Code
G=map(c(2,10),~plot_topics(model=topics,
                        reps_tk=bi_gram_tok, 
                        reps_docs=typical_docs,
                        topic_ki=.x))
Warning: The `trans` argument of `continuous_scale()` is deprecated as of ggplot2 3.5.0.
ℹ Please use the `transform` argument instead.
Code
ggpubr::ggarrange(plotlist=G,nrow=2)

Topic 3 seems to be describe about “machine learning”. One of the hot words. It makes no surprise that some how related to student ship.

topic 8, topic 9 both are very scientific. As we can see the description are very sentific and specific to particular field.

Code
G=map(c(8,9),~plot_topics(model=topics,
                        reps_tk=bi_gram_tok, 
                        reps_docs=typical_docs,
                        topic_ki=.x))
ggpubr::ggarrange(plotlist=G,nrow=2)

topic 8 probably sounds like biology. topic 9 sounds like material science.

It also interesting to see that although term “artificial intelligence” is frequently linked together, the “beta” score is actually low.

Topic Against Institution

Code
topics |> 
  tidy("gamma") |> 
  filter(gamma > 0.75) |> 
  left_join(gtr_desc,c("document"="id"))

Focus on Research Grant

The histogram shows us that research grant is the biggest chunk. So let’s apply “research grant” by itself to see if there anything interesting.

Code
## filter to research project only
res_prj = analysis_prj |> 
  filter(ProjectCategory=="Research Grant")

## research document term matrix
res_dtm = res_prj |> 
  select(id,abstractText) |> 
  unnest_tokens(word,abstractText) |> 
  anti_join(stop_words) |> 
  count(id,word,sort=T) |> 
  cast_dtm(id,word,n)
Joining with `by = join_by(word)`
Code
## research lda models 
if(interactive() ) {
  res_lda = res_dtm |> topicmodels::LDA(k=10)
  saveRDS(res_lda,"cache/05-res_lda.rds")
} else {
  res_lda=readRDS("cache/05-res_lda.rds")
}
lda output for research fund abstract text only
res_reps = res_lda |> 
  tidy("gamma") |> 
  filter(gamma > 0.75) |> # 0.75 is a number we can be sure there is only one topic
                         # for one document
  rename(id=document) |> 
  left_join(analysis_prj, "id")

## compute three data sets
res_betas = res_lda |> tidy("beta")
res_bi_gram = res_reps |> 
  compute_bi_gram(abstractText)
res_reps_top = res_reps |> 
  group_by(topic) |> 
  slice_max(gamma,n=1)
Code
## a simple test of co-herence
res_reps |> 
  group_by(topic) |> 
  slice_max(gamma,n=1) |> 
  select(topic,LeadROName,Department,Title)

Just by looking at department of research topic, it seems that some of the topic perhaps not as coherent as we hopped. It is possible that we need less topic than 10 perhaps.

Lets do another sample

Code
res_reps |> 
  group_by(topic) |> 
  slice_sample(n=1) |> 
  select(topic,LeadROName,Department,Title)

So actually this maybe doing alright. Lets analysis department fields.

Code
res_reps |> 
  select(topic,LeadROName,Department,Title,id) |> 
  unnest_tokens(word,Department) |> 
  anti_join(stop_words,"word") |> 
  # filter(!word %in% c("sch","school",'science')) |> 
  count(id,word,topic,sort=T) |> 
  bind_tf_idf(word,id,n) |>
  group_by(topic) |> 
  slice_max(tf_idf,n=10) |> 
  ggplot(aes(x = tf_idf, y = reorder_within(word, tf_idf, topic) )) + 
  geom_col(fill="black") +
  scale_y_reordered() +
  facet_wrap(~ topic,scale="free_y") +
  theme_minimal() +
  ggtitle("Topic Versus Research Insititution Deparatment")

So 5 and 6 could be similar, 3 and 4 could stand one topic. But we cannot be too sure yet. So

Code
## this process also merge similarish topics
topic_guess = c(
  "AI & Public Wellfare",
  "Health & Medical",
  "Computer & Mathematic",
  
  "Applied AI",
  "Environmental Biology",
  "Cellor Biology",
  
  "AI & Public Wellfare",
  "AI & Public Wellfare",
  "Renewable Energy",
  "Material Science"
)

Interestingly, climate change is on longer on this. For continence, we put number 1,7,8 all under one umberalla “AI & Public Well-fare’ which

Appendix

Developing a Visualization for Topic Modeling**

I found bi-gram graph compensate traditional beta count graph. Instead of “reading tea leafs”, you can try read along the edge.

draft script
bi_gram = topic_binded |> 
  select(id,abstractText,topic) |> 
  unnest_tokens(pharse,abstractText,'ngrams',n=2) |> 
  separate(pharse, into=c("word1","word2"),sep=" ") |> 
  anti_join(stop_words, c("word1"="word")) |> 
  anti_join(stop_words, c("word2"="word")) |> 
  count(topic,word1,word2,sort=T)

typical_graph = bi_gram |> 
  filter(!is.na(word1) & !is.na(word2)) |>
  group_by(topic) |>
  slice_max(n,n=20) |> 
  relocate(word1,word2) |> 
  as_tbl_graph()

G = list()
for (i in c(2,10,8,9)) {
  g = typical_graph |> 
    activate(edges) |>
    filter(topic == i) |> 
    activate(nodes) |> 
    filter(!node_is_isolated())
  if(length(g)==0) next()
  plot_g = g |> 
    ggraph('kk') +
    geom_edge_link(aes(width=n, alpha=n),trans="log",color="royalblue") +
    geom_node_text(aes(label=name),repel = T) +
    geom_node_point(size=5) +
    theme_void() +
    ggtitle(paste("topic",i))
  G = append(G, list(plot_g))
}
ggpubr::ggarrange(plotlist=G,ncol=2, nrow=2)

List of all Topics From the Frist Extraction

plot all topics trained so far
G=map(1:10,~plot_topics(model=topics,
                        reps_tk=bi_gram_tok, 
                        reps_docs=typical_docs,
                        topic_ki=.x))
ggpubr::ggarrange(plotlist=G,nrow=10)

List of All Topic Extraction for Research Grant Only

The topic read

plot all topics trained so far
G=map(1:10,~plot_topics(model=res_lda,
                        reps_tk=res_bi_gram, 
                        reps_docs=res_reps_top,
                        topic_ki=.x,
                        color= "coral2"
                        ))
ggpubr::ggarrange(plotlist=G,nrow=10)